# -*- coding: utf-8 -*-
import scrapy


class A91wenmiSpider(scrapy.Spider):
    """Spider that collects article text from 91wenmi.com listing pages.

    Listing pages are fetched in ``start_requests``; ``parse`` follows each
    article link found on a listing page, and ``parse_news`` yields one
    ``{'article': <text>}`` item per article page.
    """

    name = '91wenmi'
    # Number of listing pages to crawl, counting the un-numbered first page.
    number = 10

    def start_requests(self):
        """Yield a request for every listing page from 1 through ``number``.

        The first page is the bare directory URL; later pages are requested
        as ``index_<page>.html``.
        """
        yield scrapy.Request(
            url="http://www.91wenmi.com/wenmi/dangjian/sixianghuibao/",
            callback=self.parse,
        )
        # The previous loop used range(2, number) with ``i + 1`` and therefore
        # never requested index_2.html. Pages 2..number are all covered now.
        # NOTE(review): assumes the site numbers its second page as
        # index_2.html -- confirm against the live pagination links.
        for page in range(2, self.number + 1):
            yield scrapy.Request(
                url='http://www.91wenmi.com/wenmi/dangjian/sixianghuibao/index_{0}.html'.format(page),
                callback=self.parse,
            )

    def parse(self, response):
        """Follow every article link found on a listing page.

        Each ``href`` selected by ``div.mainL dl dt a`` is followed and the
        resulting response is handled by ``parse_news``.
        """
        for href in response.css('div.mainL dl dt a::attr(href)').getall():
            yield response.follow(href, self.parse_news)

    def parse_news(self, response):
        """Extract the article body text and yield it as an item.

        Text nodes directly under ``div#content`` are tried first. If they
        add up to fewer than 30 characters, the body is taken from the text
        of the ``<p>`` elements inside ``div#content`` instead.
        """
        article = ''.join(response.css('div#content::text').getall())
        if len(article) < 30:
            # Too short: the content is in <p> children rather than directly
            # in div#content.
            article = ''.join(response.css('div#content p::text').getall())
        yield {'article': article}
